library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer)
library(Rtsne)
library(knitr)
Load dataset (preprocessing code in 19_10_11_create_dataset.Rmd)
# Clustering whose module assignments are used throughout this analysis
clustering_selected <- 'DynamicHybridMergedSmall'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"
# Dataset created with DynamicTreeMerged algorithm
dataset_file <- paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv')
dataset <- read.csv(dataset_file, row.names = 1)
Remove genes without cluster (Module=gray)
Remove genes with Module-Trait correlation missing
# Module label(s) whose Module-Trait correlation is missing; genes belonging
# to these modules are dropped below, together with unclustered ('gray') genes
rm_cluster = dataset[is.na(dataset$MTcor),clustering_selected] %>% unique %>% as.character
print(paste0('Removing ', sum(dataset[,clustering_selected]=='gray'), ' genes without cluster'))
## [1] "Removing 178 genes without cluster"
# NOTE(review): if rm_cluster ever contains more than one module, paste0 is
# vectorised and this prints the message once per module — confirm intended
print(paste0('Removing ', sum(is.na(dataset$MTcor)), ' genes belonging to module(s) without module-trait correlation: ',
rm_cluster))
## [1] "Removing 1555 genes belonging to module(s) without module-trait correlation: #00BD64"
# Keep only clustered genes with a valid Module-Trait correlation.
# filter() drops rownames; they are restored after the column pruning below
new_dataset = dataset %>% filter(dataset[,clustering_selected]!='gray' & !is.na(MTcor))
Using Module Membership variables instead of binary module membership
Not including p-value variables
Including a new variable with the absolute value of GS
Removing information from gray module (unclassified genes) and any other module that did not have a Module-Trait value
Objective variable: Binary label indicating if it’s in the SFARI dataset or not (including score 6)
# Drop the p-value columns, the cluster-label column, the module-membership
# column(s) of the removed module(s), and the gray-module membership column.
# NOTE(review): if rm_cluster were empty, paste() with a zero-length vector
# returns character(0) and matches() would get no pattern — confirm this chunk
# only runs when at least one module lacks a Module-Trait correlation
new_dataset = new_dataset %>% dplyr::select(-c(matches(paste('pval', clustering_selected,
gsub('#','',rm_cluster), sep='|')), MMgray)) %>%
# absGS: absolute Gene Significance; SFARI: binary objective variable
mutate('absGS'=abs(GS), 'SFARI'=ifelse(gene.score=='None', FALSE, TRUE)) %>%
dplyr::select(-gene.score)
# Restore the gene IDs lost by filter(), using the same mask as the row filter
rownames(new_dataset) = rownames(dataset)[!is.na(dataset$MTcor) & dataset[,clustering_selected]!='gray']
rm(rm_cluster)
# Keep the unfiltered dataset for later module-level lookups
original_dataset = dataset
dataset = new_dataset
print(paste0('The final dataset contains ', nrow(dataset), ' observations and ', ncol(dataset), ' variables.'))
## [1] "The final dataset contains 14867 observations and 38 variables."
rm(new_dataset)
Sample of the dataset
dataset %>% head %>% kable
| MTcor | GS | MM.00C1A8 | MM.4B9FFF | MM.8195FF | MM.C17FFF | MM.C49A00 | MM.D29300 | MM.FF64B2 | MM.FF699D | MM.FB61D7 | MM.00C0BB | MM.53B400 | MM.E88523 | MM.00B0F6 | MM.F365E6 | MM.FD6F86 | MM.00BBDD | MM.00B70C | MM.75AF00 | MM.DE8C00 | MM.B4A000 | MM.F8766D | MM.00B6EB | MM.00BF7D | MM.D774FD | MM.A3A500 | MM.F17E4F | MM.00C094 | MM.00A8FF | MM.00BECD | MM.A58AFF | MM.E76BF3 | MM.FF61C5 | MM.00BB45 | MM.8EAB00 | absGS | SFARI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ENSG00000000003 | 0.7601073 | 0.3827131 | -0.3477130 | -0.1449705 | -0.1646246 | -0.2829441 | -0.3416614 | -0.1752186 | -0.1947862 | 0.0280602 | -0.1139808 | -0.1800823 | -0.2265495 | 0.0824399 | 0.1910086 | -0.0125141 | -0.0148751 | -0.1126751 | -0.0522299 | 0.1702235 | -0.0318971 | 0.5014482 | 0.3716079 | 0.1920288 | 0.3141084 | 0.3318810 | 0.4772117 | 0.4106499 | 0.1092023 | 0.1433862 | 0.1295224 | -0.0098212 | 0.3358176 | 0.1393598 | 0.1434403 | 0.2296628 | 0.3827131 | FALSE |
| ENSG00000000419 | 0.4150643 | 0.0361671 | -0.0509414 | -0.4181807 | -0.3435142 | -0.0950710 | 0.1271936 | 0.3731103 | -0.1661217 | -0.0683150 | 0.1835545 | -0.2093919 | -0.0438694 | 0.0019727 | 0.2711652 | 0.1487585 | 0.1426498 | -0.1856460 | -0.1161695 | 0.0769049 | -0.0790248 | -0.2102113 | -0.1908434 | -0.0142087 | -0.0346279 | -0.2307758 | 0.0273226 | 0.1300158 | -0.0003214 | -0.1399217 | 0.1449922 | 0.3286803 | 0.1418381 | 0.1323354 | 0.5218180 | 0.2613671 | 0.0361671 | FALSE |
| ENSG00000000457 | -0.9216750 | -0.4005675 | 0.1874271 | 0.2806782 | 0.1875352 | 0.3747611 | 0.3393944 | 0.2716503 | 0.3227764 | 0.1136580 | 0.1760050 | 0.1146922 | 0.3195916 | -0.2275923 | -0.0641184 | -0.0820587 | 0.0331912 | 0.1632690 | -0.0397420 | -0.1388166 | -0.0426981 | -0.0552223 | -0.0629920 | -0.1494477 | -0.2855344 | -0.2286590 | -0.3060407 | -0.2607013 | -0.3293047 | -0.1707641 | -0.2804889 | -0.0996501 | -0.2359453 | -0.1367952 | -0.0969217 | -0.2679999 | 0.4005675 | FALSE |
| ENSG00000000460 | -0.1165650 | -0.2386668 | 0.0081068 | 0.0453039 | -0.1677691 | 0.1659316 | 0.3632797 | 0.4758476 | 0.1413266 | 0.0073811 | 0.0232871 | -0.0620729 | 0.0492475 | 0.1279434 | 0.2559762 | 0.1658171 | 0.2708015 | 0.0877830 | -0.0212876 | -0.1033743 | -0.0918440 | -0.2760330 | -0.2148459 | -0.1636790 | -0.3609935 | -0.4116075 | -0.4156419 | -0.2123721 | -0.2596629 | -0.3395212 | -0.0682352 | 0.1426534 | 0.0297760 | 0.1965458 | 0.3086452 | -0.2279076 | 0.2386668 | FALSE |
| ENSG00000000971 | 0.8556942 | 0.6586707 | -0.2759526 | -0.3511587 | -0.2780262 | -0.3940097 | -0.4692445 | -0.2210057 | -0.1077318 | 0.0231030 | -0.0266761 | -0.2443301 | -0.2726795 | 0.0025340 | 0.0112294 | 0.2300797 | -0.1711726 | -0.3056525 | 0.0899908 | -0.0559141 | 0.0961331 | 0.0052618 | -0.1778786 | 0.2297739 | 0.1605946 | 0.2297970 | 0.4252786 | 0.2986677 | 0.8196572 | 0.4208318 | 0.5669501 | 0.0737886 | 0.1440107 | 0.2222202 | 0.2006217 | 0.0904164 | 0.6586707 | FALSE |
| ENSG00000001036 | 0.7601073 | 0.3221038 | -0.1361035 | -0.5330410 | -0.4449867 | -0.5478868 | -0.3157576 | 0.0951756 | -0.3301118 | -0.2001291 | 0.1974429 | -0.1947359 | -0.1959688 | 0.1137417 | 0.0583008 | 0.0552634 | 0.0148300 | -0.4207842 | -0.1377539 | 0.0398521 | 0.1730645 | -0.0030739 | 0.2322611 | 0.0643902 | -0.0654919 | -0.2230414 | 0.1810246 | 0.5690748 | 0.0845592 | 0.0307338 | 0.3770384 | 0.5580772 | 0.5086895 | 0.4994903 | 0.5013863 | 0.4209643 | 0.3221038 | FALSE |
Objective variable distribution: Unbalanced labels
# Class balance of the objective variable
label_counts <- table(dataset$SFARI)
print(label_counts)
##
## FALSE TRUE
## 14039 828
# Share of positive (SFARI) observations, as a percentage
pct_positive <- round(mean(dataset$SFARI) * 100, 2)
cat(paste0('\n', pct_positive, '% of the observations are positive'))
##
## 5.57% of the observations are positive
Chose the t-SNE algorithm because it preserves local distances between points
The SFARI label is still close to the absolute value of Gene Significance. This time the MM variables seem to be grouped in 3 clusters
# t-SNE on the TRANSPOSED dataset: each point is a variable (column), not a gene.
# NOTE(review): no set.seed() before Rtsne, so this embedding is not
# reproducible between knits — consider seeding here
tsne = dataset %>% t %>% Rtsne(perplexity=10)
# Tag each variable by its type (module membership / SFARI / GS / MTcor)
# for colouring; grepl matches on the column-name prefix
plot_data = data.frame('ID'=colnames(dataset), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2],
type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))
ggplotly(plot_data %>% ggplot(aes(C1, C2, color=type)) + geom_point(aes(id=ID)) +
theme_minimal() + ggtitle('t-SNE visualisation of variables'))
The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and modules with low correlation far away from the SFARI tag
# One row per module: module colour (ID) and its Module-Trait correlation
mtcor_by_module <- original_dataset %>%
  dplyr::select(matches(clustering_selected), MTcor) %>%
  unique
colnames(mtcor_by_module) <- c('ID', 'MTcor')
# Rewrite '#<colour>' module IDs as 'MM.<colour>' so they match the
# variable names in plot_data, then attach the correlation to each point
mtcor_by_module <- mtcor_by_module %>% mutate(ID = gsub('#', 'MM.', ID))
plot_data <- right_join(mtcor_by_module, plot_data, by = 'ID')
ggplotly(plot_data %>% ggplot(aes(C1, C2, color=MTcor)) + geom_point(aes(id=ID)) +
           scale_color_viridis() + theme_minimal() +
           ggtitle('t-SNE of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, tsne)
# Mean Expression data
# Load the preprocessed expression data; presumably also brings datGenes,
# datMeta, dds and DE_info into the environment (they are rm'd below)
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
# Mean level of expression per gene, used to colour the last PCA panel
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))
# PCA
# PCA of the transposed dataset: pca$rotation holds one point per gene
pca = dataset %>% t %>% prcomp
plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2],
'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
# Make the (minority) SFARI genes more opaque so they stay visible
mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')
# NOTE(review): the 'ID' join key may be factor or character depending on the
# R version's stringsAsFactors default — confirm the join actually matches
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.7) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
# Axis labels carry the % of variance explained by each component
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme(legend.position='bottom')
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.5) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(aes(alpha=alpha)) +
theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
# Add marginal density plots (by SFARI label) to the third panel
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)
p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.5) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')
# 2x2 grid of the four PCA views
grid.arrange(p1, p2, p3, p4, nrow=2)
# Drop the RData objects and intermediate plots that are no longer needed
rm(pca, datExpr, datGenes, datMeta, dds, DE_info, p1, p2, p3, p4)
Should check SMOTE (Synthetic Minority Over-sampling Technique) method for later
Aiming for 1:2 ratio in labels (a bit arbitrary, but I was thinking that 1/3-2/3 doesn’t sound as imbalanced and this way I don’t have to remove that many genes or over-sample so much as with a 1:1 relation)
Under-sampling observations with negative SFARI labels: Keep 1/3 of the negative class
# Under-sample the negative class: keep ~1/3 of the non-SFARI genes.
# FIX: seed the RNG here — previously this sample() ran BEFORE the first
# set.seed(123) call, so the under-sampled dataset (and everything downstream)
# changed on every knit.
set.seed(123)
negative_obs = which(!dataset$SFARI)
# Drop 2/3 of the negative observations, keeping the remaining 1/3
remove_obs = sample(negative_obs, size=floor(length(negative_obs)*2/3))
dataset = dataset[-remove_obs,]
Over-sampling observations with positive SFARI label: Sample with replacement until the positive class reaches 3x its original size (i.e. add 2x extra observations)
Need to divide first into train and test sets to keep the sets independent
set.seed(123)
# 80/20 train/test split. Splitting BEFORE over-sampling keeps the sets
# independent: duplicated positive genes cannot land in both sets.
# FIX: seq_len() instead of the 1:nrow() anti-pattern (identical draws here).
train_idx = sample(seq_len(nrow(dataset)), size=floor(0.8*nrow(dataset)))
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]
Sample with replacement positive observations in train set
# Over-sample positives in the TRAINING set only: append 2 extra copies
# (with replacement) of the positive class, tripling its size
positive_obs = which(train_set$SFARI)
add_obs = sample(positive_obs, size=2*length(positive_obs), replace=TRUE)
train_set = train_set[c(seq_len(nrow(train_set)), add_obs),]
table(train_set$SFARI)
##
## FALSE TRUE
## 3749 1971
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# glm needs a factor/binary response; convert the logical label
train_set$SFARI <- factor(train_set$SFARI)
# Logistic regression of the SFARI label on all remaining variables
fit <- glm(SFARI ~ ., data = train_set, family = 'binomial')
# Predicted probability of being a SFARI gene for each test observation
test_set$pred <- predict(fit, newdata = test_set, type = 'response')
Using AUC instead of accuracy to measure the performance of the model because of the imbalance between classes
# ROC curve and area under it (ROCR objects)
pred_ROCR <- prediction(test_set$pred, test_set$SFARI)
roc_ROCR <- performance(pred_ROCR, measure = 'tpr', x.measure = 'fpr')
auc_perf <- performance(pred_ROCR, measure = 'auc')
AUC <- auc_perf@y.values[[1]]
plot(roc_ROCR, main = paste0('ROC curve (AUC=', round(AUC, 2), ')'), col = '#009999')
# Diagonal = performance of a random classifier
abline(a = 0, b = 1, col = '#666666')
# Lift curve: lift vs rate of positive predictions
lift_ROCR <- performance(pred_ROCR, measure = 'lift', x.measure = 'rpp')
plot(lift_ROCR, main = 'Lift curve', col = '#86b300')
SFARI genes have a slightly higher predicted score distribution than the rest
# Distribution of predicted scores, split by true SFARI label
plot_data <- dplyr::select(test_set, pred, SFARI)
ggplot(plot_data, aes(pred, fill = SFARI, color = SFARI)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  ggtitle('Predicted score distribution by SFARI label')
Gene significance doesn’t seem to be relevant for the model
summary(fit)
##
## Call:
## glm(formula = SFARI ~ ., family = "binomial", data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7996 -0.8921 -0.6353 1.1140 2.4638
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.03145 0.05828 -17.698 < 2e-16 ***
## MTcor -0.05326 0.08141 -0.654 0.51296
## GS -1.59267 1.01431 -1.570 0.11637
## MM.00C1A8 -0.40755 0.81011 -0.503 0.61490
## MM.4B9FFF 1.35449 2.23466 0.606 0.54443
## MM.8195FF -2.55217 2.37102 -1.076 0.28175
## MM.C17FFF 5.33821 2.94619 1.812 0.07000 .
## MM.C49A00 -10.08225 3.28362 -3.070 0.00214 **
## MM.D29300 -0.11601 2.26198 -0.051 0.95910
## MM.FF64B2 -3.42565 1.55313 -2.206 0.02741 *
## MM.FF699D -0.01221 0.90475 -0.013 0.98923
## MM.FB61D7 4.78697 2.15352 2.223 0.02622 *
## MM.00C0BB -3.49686 2.95924 -1.182 0.23733
## MM.53B400 5.68630 3.39267 1.676 0.09373 .
## MM.E88523 2.30670 0.67810 3.402 0.00067 ***
## MM.00B0F6 0.74557 1.18783 0.628 0.53022
## MM.F365E6 -2.78045 3.25272 -0.855 0.39266
## MM.FD6F86 6.11707 4.16962 1.467 0.14236
## MM.00BBDD 2.28886 2.27628 1.006 0.31464
## MM.00B70C 1.59202 3.00491 0.530 0.59625
## MM.75AF00 -2.26124 1.07989 -2.094 0.03626 *
## MM.DE8C00 -0.37769 0.95407 -0.396 0.69220
## MM.B4A000 2.95619 1.66353 1.777 0.07556 .
## MM.F8766D -3.58622 1.44402 -2.483 0.01301 *
## MM.00B6EB 0.55348 1.11330 0.497 0.61908
## MM.00BF7D 3.46299 1.68429 2.056 0.03978 *
## MM.D774FD -1.00867 1.99616 -0.505 0.61335
## MM.A3A500 -3.84982 1.99995 -1.925 0.05424 .
## MM.F17E4F 4.17438 3.19947 1.305 0.19199
## MM.00C094 2.86735 1.57236 1.824 0.06821 .
## MM.00A8FF -5.46096 2.87844 -1.897 0.05780 .
## MM.00BECD 3.07719 1.89615 1.623 0.10462
## MM.A58AFF 2.32565 1.64045 1.418 0.15628
## MM.E76BF3 -0.62129 2.89972 -0.214 0.83035
## MM.FF61C5 -0.78324 1.46051 -0.536 0.59177
## MM.00BB45 -6.82551 2.58401 -2.641 0.00826 **
## MM.8EAB00 -2.25599 1.88154 -1.199 0.23052
## absGS 0.36423 0.15628 2.331 0.01977 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7367.7 on 5719 degrees of freedom
## Residual deviance: 6635.1 on 5682 degrees of freedom
## AIC: 6711.1
##
## Number of Fisher Scoring iterations: 3
There doesn’t seem to be a relation between the modules’ coefficients and their correlation with Diagnosis. However, since there is correlation between the variables, it is not correct to analyse their individual coefficients in this situation
Also, these results change a lot between different runs
# Regression coefficients with a significance flag, keyed by variable name
# (summary(fit)$coefficients hoisted into a single intermediate)
coefs <- summary(fit)$coefficients
var_fit_info <- coefs %>% as.data.frame %>%
  mutate(signif = `Pr(>|z|)` < 0.05, ID = rownames(coefs))
# Module sizes (n genes) and Module-Trait correlation per module
plot_data <- original_dataset %>% dplyr::select(matches(clustering_selected), MTcor)
colnames(plot_data) <- c('ID', 'MTcor')
plot_data <- plot_data %>%
  mutate(ID = gsub('#', 'MM.', ID)) %>%
  group_by(ID, MTcor) %>%
  tally %>%
  inner_join(var_fit_info, by = 'ID')
ggplotly(plot_data %>% ggplot(aes(Estimate, MTcor, color=signif, size=n)) + geom_point(aes(id=ID)) +
           geom_smooth(method='lm', se=FALSE) + theme_minimal() +
           xlab('Coefficient in regression') + ylab('Module-Diagnosis correlation'))
rm(var_fit_info, coefs)
Strong correlations between variables
# Correlation matrix of all predictors (last column, the SFARI label, dropped)
cors <- cor(train_set[, seq_len(ncol(train_set) - 1)])
heatmap.2(cors, dendrogram = 'none', col = brewer.pal(11, 'RdBu'),
          scale = 'none', trace = 'none')
Print genes with highest scores in test set
# 50 genes with the highest predicted score, with their true SFARI label
test_set %>%
  mutate(ID = rownames(test_set)) %>%
  dplyr::select(ID, SFARI, pred) %>%
  arrange(desc(pred)) %>%
  top_n(50, wt = pred) %>%
  kable
| ID | SFARI | pred |
|---|---|---|
| ENSG00000145362 | TRUE | 0.8485268 |
| ENSG00000021645 | TRUE | 0.8131970 |
| ENSG00000100393 | TRUE | 0.8121960 |
| ENSG00000170396 | TRUE | 0.8112124 |
| ENSG00000174469 | TRUE | 0.7869432 |
| ENSG00000163625 | TRUE | 0.7784175 |
| ENSG00000126883 | FALSE | 0.7736464 |
| ENSG00000091656 | FALSE | 0.7649464 |
| ENSG00000157064 | FALSE | 0.7613582 |
| ENSG00000170579 | TRUE | 0.7572440 |
| ENSG00000116698 | FALSE | 0.7512478 |
| ENSG00000123908 | FALSE | 0.7485797 |
| ENSG00000196876 | TRUE | 0.7435254 |
| ENSG00000170448 | FALSE | 0.7427600 |
| ENSG00000086205 | TRUE | 0.7340385 |
| ENSG00000078804 | FALSE | 0.7292197 |
| ENSG00000198646 | FALSE | 0.7223111 |
| ENSG00000178385 | FALSE | 0.7177402 |
| ENSG00000119866 | TRUE | 0.7109069 |
| ENSG00000145016 | FALSE | 0.7077317 |
| ENSG00000143569 | FALSE | 0.7040398 |
| ENSG00000163171 | FALSE | 0.7028713 |
| ENSG00000187605 | FALSE | 0.7022734 |
| ENSG00000163637 | TRUE | 0.7011450 |
| ENSG00000159140 | FALSE | 0.6936414 |
| ENSG00000087460 | TRUE | 0.6907060 |
| ENSG00000196090 | TRUE | 0.6866897 |
| ENSG00000144406 | TRUE | 0.6855405 |
| ENSG00000056661 | FALSE | 0.6850533 |
| ENSG00000022840 | FALSE | 0.6833950 |
| ENSG00000197102 | TRUE | 0.6827331 |
| ENSG00000085433 | FALSE | 0.6738892 |
| ENSG00000143079 | FALSE | 0.6713677 |
| ENSG00000132326 | TRUE | 0.6695043 |
| ENSG00000122882 | FALSE | 0.6646459 |
| ENSG00000073803 | FALSE | 0.6617198 |
| ENSG00000149294 | FALSE | 0.6592623 |
| ENSG00000076356 | FALSE | 0.6573431 |
| ENSG00000101126 | TRUE | 0.6558775 |
| ENSG00000081479 | TRUE | 0.6546380 |
| ENSG00000215421 | FALSE | 0.6534893 |
| ENSG00000187391 | FALSE | 0.6531410 |
| ENSG00000185499 | FALSE | 0.6492286 |
| ENSG00000114541 | FALSE | 0.6412666 |
| ENSG00000078018 | TRUE | 0.6386647 |
| ENSG00000164506 | TRUE | 0.6368599 |
| ENSG00000115568 | FALSE | 0.6298432 |
| ENSG00000143507 | FALSE | 0.6296557 |
| ENSG00000171988 | TRUE | 0.6291181 |
| ENSG00000048471 | FALSE | 0.6279838 |